# -*- coding: utf-8 -*-


# Cell 1 – build alon.csv (2000 genes × 62 samples) right inside Colab
# ================================================================
# Colab has R.  A single bash cell pipes a short here-document to
# Rscript, so *everything* happens in this one notebook cell.
%%bash
Rscript - <<'RSCRIPT'
# Install EMMIXgene from CRAN only when it is not already available;
# this is the package the Alon matrix is loaded from below.
pkg <- "EMMIXgene"
if (!requireNamespace(pkg, quietly = TRUE)) {
  install.packages(pkg, repos = "https://cloud.r-project.org")
}
library(pkg, character.only = TRUE)

# Load the `alon_data` data frame and report its dimensions
# (expected: 2000 gene rows × 62 sample columns).
data(alon_data)
dims <- dim(alon_data)
message("Data shape: ", dims[1], " × ", dims[2])

# Export as an unquoted CSV; row names are written as the first column.
write.csv(alon_data, file = "alon.csv", row.names = TRUE, quote = FALSE)

# Sanity check: first rows of the first six columns.
print(head(alon_data[, 1:6]))
RSCRIPT

# ================================================================
# Cell 2 – covariance, rank‑p selection, perturbation experiment
# ================================================================
!pip -q install --upgrade numpy pandas scipy   # silent upgrade if needed

import numpy as np
import pandas as pd
import scipy.linalg as la

# ----------------------------------------------------------------
# 1.  Load the expression matrix and build the covariance
# ----------------------------------------------------------------
# Rows of the CSV are genes, columns are samples (first column = row names).
expr = pd.read_csv("alon.csv", index_col=0)              # 2000 × 62
# Take an explicit copy of the values: `to_numpy` may otherwise hand back a
# view of the DataFrame's buffer, which is read-only under pandas
# Copy-on-Write (the in-place `-=` below would raise) and, on older pandas,
# would silently overwrite the contents of `expr`.
X = expr.to_numpy(dtype=float, copy=True)
X -= X.mean(axis=1, keepdims=True)                       # center each gene (row)
# np.cov treats each row as a variable, so the result is gene × gene.
cov = np.cov(X)                                          # symmetric, PSD
n = cov.shape[0]
print(f"Alon covariance matrix: shape={cov.shape}, "
      f"rank={np.linalg.matrix_rank(cov)}")

# ----------------------------------------------------------------
# 2.  Pick p (95 % Frobenius energy) and compute the eigen‑gap δₚ
# ----------------------------------------------------------------
# Symmetric eigendecomposition.  `eigh` returns eigenvalues in ascending
# order, so flip both outputs to put the dominant eigenpairs first.
eigvals, eigvecs = la.eigh(cov)
eigvals, eigvecs = eigvals[::-1], eigvecs[:, ::-1]

# Smallest p whose leading eigenvalues carry at least 95 % of the total
# squared-eigenvalue mass.
cum_energy = np.cumsum(np.square(eigvals))
energy_target = 0.95 * cum_energy[-1]
p = np.searchsorted(cum_energy, energy_target) + 1

# Eigen-gap between the p-th and (p+1)-th eigenvalue; when p already spans
# the whole spectrum the "next" eigenvalue is taken to be zero.
sigma_p = eigvals[p - 1]
if p < n:
    sigma_p1 = eigvals[p]
else:
    sigma_p1 = 0.0
delta_p = sigma_p - sigma_p1

print("\n=== Energy target 95% → p =", p, "===")
print(
    f"σ_p   = {sigma_p:.6e}",
    f"σ_p+1 = {sigma_p1:.6e}",
    f"δ_p   = {delta_p:.6e}\n",
    sep="\n",
)

# Rank-p approximation A_p assembled from the leading p eigenpairs.
U_p = eigvecs[:, :p]
Sigma_p = np.diag(eigvals[:p])
A_p = (U_p @ Sigma_p) @ U_p.T

# ----------------------------------------------------------------
# 3.  Perturbation experiment (10 noise levels)
# ----------------------------------------------------------------
def best_rank_p(M: np.ndarray, p: int) -> np.ndarray:
    """Return the rank-``p`` truncated-SVD approximation of ``M``.

    The matrix is decomposed with a thin SVD and rebuilt from the ``p``
    largest singular values and their singular vectors.

    Args:
        M: 2-D matrix to approximate.
        p: Number of singular triplets to keep.  ``0`` yields an all-zero
            matrix of the same shape as ``M``; values larger than
            ``min(M.shape)`` simply keep every triplet.

    Returns:
        An array with the same shape as ``M`` and rank at most ``p``.

    Raises:
        ValueError: If ``p`` is negative.  Without this check a negative
            ``p`` would be interpreted as a "drop the last ``|p|``" slice
            and silently return a matrix of the wrong rank.
    """
    if p < 0:
        raise ValueError(f"p must be non-negative, got {p}")
    U, s, Vt = la.svd(M, full_matrices=False)
    # Scale the kept left singular vectors column-wise instead of building
    # an explicit diagonal matrix.
    return (U[:, :p] * s[:p]) @ Vt[:p, :]

# Ten evenly spaced noise scales from 0 up to δ_p / (4·√n).
alpha_max = delta_p / (4 * np.sqrt(n))
alphas = np.linspace(0.0, alpha_max, 10)
rng = np.random.default_rng(0)          # fixed seed → reproducible noise

# Table header: right-aligned column titles separated by two spaces.
header = "  ".join((
    f"{'α':>12}",
    f"{'‖Ã_p−A_p‖₂':>14}",
    f"{'classical':>14}",
    f"{'our bound':>14}",
    f"{'ratio':>10}",
))
print(header)
print("-" * len(header))

for alpha in alphas:
    # Dense Gaussian perturbation scaled by alpha, and its 2-norm.
    E = alpha * rng.standard_normal(size=(n, n))
    norm_E = la.norm(E, 2)

    # Rank-p approximation of the perturbed matrix and its distance
    # (2-norm) from the unperturbed rank-p approximation A_p.
    A_tilde_p = best_rank_p(cov + E, p)
    true_err = la.norm(A_tilde_p - A_p, 2)

    # The two bounds being compared against the observed error.
    classical = 2 * (norm_E + sigma_p1)
    our_bound = 7 * norm_E * sigma_p / delta_p

    # Report an infinite ratio rather than dividing by an exactly-zero error.
    if true_err == 0:
        ratio = np.inf
    else:
        ratio = our_bound / true_err

    print("  ".join((
        f"{alpha:12.4e}",
        f"{true_err:14.4e}",
        f"{classical:14.4e}",
        f"{our_bound:14.4e}",
        f"{ratio:10.3f}",
    )))